# Required Packages
import pandas as pd
import numpy as np
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## missingno
import missingno as msno
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings("ignore")
fig, ax = plt.subplots(1, 1, figsize= (16,2))
c1 = 'Maroon'
c2 = 'LightCoral'
edc = 'DarkRed'
_ = ax.text(0.4, .6, "Lending Club", color = c1,
fontdict = {'family': 'fantasy','color': 'darkred','weight': 'normal', 'size': 50},
rotation=0.,ha="center", va="center", bbox=dict(boxstyle="round", ec= edc, fc=c2))
_ = ax.text(.745, .25, "Dataset", color = c2,
fontdict = {'family': 'fantasy','color': 'darkred','weight': 'normal', 'size': 50},
rotation=0.,ha="center", va="center", bbox=dict(boxstyle="roundtooth", ec= edc, fc=c1))
_ = ax.axis('tight')
_ = ax.axis('off')
In this article, we analyze and create a predictive model for Lending Club Data which is available as a part of the creditmodel: Toolkit for Credit Modeling, Analysis and Visualization R Package. Similar datasets also can be accessed from the lendingclub website as well.
This data contains complete loan data for all loans issued through the period stated, including the current loan status (Current, Late, Fully Paid, etc.) and latest payment information. The data containing loan data through the "present" contains complete loan data for all loans issued through the previous completed calendar quarter(period: 2018Q1:2018Q4).
data_dict = {'id': 'A unique LC assigned ID for the loan listing',
'issue_d': 'The month which the loan was funded',
'loan_status': 'Current status of the loan',
'addr_state': 'The state provided by the borrower in the loan application',
'acc_open_past_24mths': 'Number of trades opened in past 24 months',
'all_util': 'Balance to credit limit on all trades',
'annual_inc': 'The self:reported annual income provided by the borrower during registration',
'avg_cur_bal': 'Average current balance of all accounts',
'bc_open_to_buy': 'Total open to buy on revolving bankcards',
'bc_util': 'Ratio of total current balance to high credit/credit limit for all bankcard accounts',
'dti': """A ratio calculated using the borrower's total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower's self:reported monthly income""",
'dti_joint': """A ratio calculated using the co:borrowers' total monthly payments on the total debt obligations, excluding mortgages and the requested LC loan, divided by the co:borrowers' combined self:reported monthly income""",
'emp_length': """Employment length in years Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years""",
'emp_title':'The job title supplied by the Borrower when applying for the loan',
'funded_amnt_inv': 'The total amount committed by investors for that loan at that point in time',
'grade': 'LC assigned loan grade',
'inq_last_12m': 'Number of credit inquiries in past 12 months',
'installment': 'The monthly payment owed by the borrower if the loan originates',
'max_bal_bc': 'Maximum current balance owed on all revolving accounts',
'mo_sin_old_il_acct': 'Months since oldest bank installment account opened',
'mo_sin_old_rev_tl_op': 'Months since oldest revolving account opened',
'mo_sin_rcnt_rev_tl_op': 'Months since most recent revolving account opened',
'mo_sin_rcnt_tl': 'Months since most recent account opened',
'mort_acc': 'Number of mortgage accounts',
'pct_tl_nvr_dlq': 'Percent of trades never delinquent',
'percent_bc_gt_75': 'Percentage of all bankcard accounts > 75',
'purpose': 'A category provided by the borrower for the loan request',
'sub_grade': 'LC assigned loan subgrade',
'term': 'The number of payments on the loan Values are in months and can be either 36 or 60',
'tot_cur_bal': 'Total current balance of all accounts',
'tot_hi_cred_lim': 'Total high credit/credit limit',
'total_acc': """The total number of credit lines currently in the borrower's credit file""",
'total_bal_ex_mort': 'Total credit balance excluding mortgage',
'total_bc_limit': 'Total bankcard high credit/credit limit',
'total_cu_tl': """Number of finance trades""",
'total_il_high_credit_limit': """Total installment high credit/credit limit""",
'verification_status_joint': """Indicates if the co:borrowers' joint income was verified by LC, not verified, or if the income source was verified""",
'zip_code': """The first 3 numbers of the zip code provided by the borrower in the loan application"""}
Data = pd.read_csv('LC_Data/lendingclub.csv', index_col = 0)
Data.head(10)
The dataset dictionary also can be found here.
| Feature | Description |
|---|---|
| id | A unique LC assigned ID for the loan listing |
| issue_d | The month which the loan was funded |
| loan_status | Current status of the loan |
| addr_state | The state provided by the borrower in the loan application |
| acc_open_past_24mths | Number of trades opened in past 24 months |
| all_util | Balance to credit limit on all trades |
| annual_inc | The self:reported annual income provided by the borrower during registration |
| avg_cur_bal | Average current balance of all accounts |
| bc_open_to_buy | Total open to buy on revolving bankcards |
| bc_util | Ratio of total current balance to high credit/credit limit for all bankcard accounts |
| dti | A ratio calculated using the borrower's total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower's self:reported monthly income |
| dti_joint | A ratio calculated using the co:borrowers' total monthly payments on the total debt obligations, excluding mortgages and the requested LC loan, divided by the co:borrowers' combined self:reported monthly income |
| emp_length | Employment length in years Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years |
| emp_title | The job title supplied by the Borrower when applying for the loan |
| funded_amnt_inv | The total amount committed by investors for that loan at that point in time |
| grade | LC assigned loan grade |
| inq_last_12m | Number of credit inquiries in past 12 months |
| installment | The monthly payment owed by the borrower if the loan originates |
| max_bal_bc | Maximum current balance owed on all revolving accounts |
| mo_sin_old_il_acct | Months since oldest bank installment account opened |
| mo_sin_old_rev_tl_op | Months since oldest revolving account opened |
| mo_sin_rcnt_rev_tl_op | Months since most recent revolving account opened |
| mo_sin_rcnt_tl | Months since most recent account opened |
| mort_acc | Number of mortgage accounts |
| pct_tl_nvr_dlq | Percent of trades never delinquent |
| percent_bc_gt_75 | Percentage of all bankcard accounts > 75 |
| purpose | A category provided by the borrower for the loan request |
| sub_grade | LC assigned loan subgrade |
| term | The number of payments on the loan Values are in months and can be either 36 or 60 |
| tot_cur_bal | Total current balance of all accounts |
| tot_hi_cred_lim | Total high credit/credit limit |
| total_acc | The total number of credit lines currently in the borrower's credit file |
| total_bal_ex_mort | Total credit balance excluding mortgage |
| total_bc_limit | Total bankcard high credit/credit limit |
| total_cu_tl | umber of finance trades |
| total_il_high_credit_limit | Total installment high credit/credit limit |
| verification_status_joint | Indicates if the co:borrowers' joint income was verified by LC, not verified, or if the income source was verified |
| zip_code | The first 3 numbers of the zip code provided by the borrower in the loan application |
def Colors (x, N = 6):
C = sns.color_palette("RdYlGn", N)
r = np.linspace(0, 100, N)
for i in range(N-1):
if r[i] <= x < r[i+1]:
return C[i]
else:
return C[-1]
def Data_info(Inp, Plot = True, FS = (16,6)):
Out = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
Out = Out.join(Inp.isnull().sum().to_frame(name = 'Number of NaN Values'), how='outer')
Out ['Size'] = Inp.shape[0]
Out['Percentage'] = 100 - np.round(100*(Out['Number of NaN Values']/Inp.shape[0]),2)
Out.index.name = 'Features'
Clr = Out['Percentage'].map(lambda x: Colors(x)).tolist()
fig, ax = plt.subplots(1, 1, figsize= FS)
Temp = Out['Percentage'].to_frame()
_ = Temp.plot(kind='bar', width = 1, color= [Clr], edgecolor = 'white', hatch = '///', legend=None, alpha=0.75, ax = ax)
_ = Temp.plot(kind='bar', width = 1, color='None', edgecolor = 'Black', legend=None, alpha=1, lw=1., ax = ax)
_ = ax.set_ylim(0, 101)
_= plt. grid(b=False, which='both', axis='both')
_= plt. grid(b=True, which='major', axis='y')
return Out
def dtypes_group(Inp):
Temp = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
Out = pd.DataFrame(index =Temp['Data Type'].unique(), columns = ['Features','Count'])
for c in Temp['Data Type'].unique():
Out.loc[Out.index == c, 'Features'] = [Temp.loc[Temp['Data Type'] == c].index.tolist()]
Out.loc[Out.index == c, 'Count'] = len(Temp.loc[Temp['Data Type'] == c].index.tolist())
Out.index.name = 'Data Type'
Out = Out.reset_index(drop = False)
Out['Data Type'] = Out['Data Type'].astype(str)
return Out
display(dtypes_group(Data).style.hide_index())
_ = Data_info(Data)
Feat= 'addr_state'
Group = Data[Feat].value_counts().to_frame('Count').sort_index().reset_index().rename(columns = {'index': Feat})
fig = go.Figure(data=go.Choropleth(locations= Group[Feat],
z = Group['Count'].astype(float),
locationmode = 'USA-states',
colorscale = 'PuBu'))
fig.update_layout(title={'text': 'Number of Loan Listing by State', 'x':0.46, 'y':0.9, 'xanchor': 'center', 'yanchor': 'top'},
geo_scope='usa')
fig.show()
display(Group.sort_values(by=['Count'],
ascending=False).set_index(Feat).head(25).T.style.hide_index().background_gradient(cmap='Blues'))
Feat= 'addr_state'
Group = Data.groupby([Feat,'loan_status'])[Feat].agg('count').to_frame('Count').reset_index()
fig = go.Figure(data=go.Choropleth(locations= Group.loc[Group['loan_status'] == 1, Feat],
z = Group.loc[Group['loan_status'] == 1, 'Count'].astype(float),
locationmode = 'USA-states',
colorscale = 'Greens'))
fig.update_layout(title={'text': 'Number of Approved Loans by State',
'x':0.46, 'y':0.9, 'xanchor': 'center', 'yanchor': 'top'},
geo_scope='usa')
fig.show()
fig = go.Figure(data=go.Choropleth(locations= Group.loc[Group['loan_status'] == 0, Feat],
z = Group.loc[Group['loan_status'] == 0, 'Count'].astype(float),
locationmode = 'USA-states',
colorscale = 'Reds'))
fig.update_layout(title={'text': 'Number of Not Approved Loans by State',
'x':0.46, 'y':0.9, 'xanchor': 'center', 'yanchor': 'top'},
geo_scope='usa')
fig.show()
Group.columns = [x.replace('_',' ').title().replace('Addr', 'Address') for x in Group.columns]
Group['Percentage'] = np.round(100* Group['Count'] /Group['Count'].sum(), 2)
Group['Loan Status'] = Group['Loan Status'].map(lambda x: 'Approved' if x == 0 else 'Not Approved')
Colors = ['LightGreen','LightSalmon']
LineColor = 'Black'
fig = px.bar(Group, x= 'Address State', y= 'Percentage', orientation='v',
color = 'Loan Status', text = 'Percentage', hover_data= Group.columns, color_discrete_sequence= Colors,
height= 500)
fig.update_traces(marker_line_color=LineColor, marker_line_width=.8, opacity=1,
texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(plot_bgcolor= 'white', uniformtext_minsize= 8, uniformtext_mode='hide',
title={'text': 'Number of Applications from Each State and the Application Approval Status',
'x':0.46, 'y':0.92, 'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(range=[0, 16],
showgrid=True, gridwidth=1, gridcolor='Lightgray',
showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
def mytitle(x):
x = x.replace('_',' ')
x = x.title()
Dict ={' A ':' a ', ' An ':' an ', ' The': ' the ', ' Of ': ' of ', ' In ': ' in '}
for c in Dict.keys():
x = x.replace(c, Dict[c])
return x
Feat= 'acc_open_past_24mths'
Group = Data.groupby([Feat,'loan_status'])[Feat].agg('count').to_frame('Count').reset_index()
Feat = mytitle(data_dict[Feat])
Temp = [Feat, 'Loan Status', 'Count']
Group.columns = Temp
del Temp
Group['Percentage'] = np.round(100* Group['Count'] /Group['Count'].sum(), 2)
Group['Loan Status'] = Group['Loan Status'].map(lambda x: 'Approved' if x == 0 else 'Not Approved')
Colors = ['LightGreen','LightSalmon']
LineColor = 'Black'
fig = px.bar(Group, x= Feat, y= 'Percentage', orientation='v',
color = 'Loan Status', text = 'Percentage', hover_data= Group.columns, color_discrete_sequence= Colors,
height= 500)
fig.update_traces(marker_line_color=LineColor, marker_line_width=.8, opacity=1,
texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(plot_bgcolor= 'white', uniformtext_minsize= 8, uniformtext_mode='hide',
title={'text': 'Number of Trades Opened in Past 24 Months by Borrowers and the Application Approval Status',
'x':0.46, 'y':0.92, 'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(range=[0, 14],
showgrid=True, gridwidth=1, gridcolor='Lightgray',
showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
Group = Data[['annual_inc', 'loan_status']]
Group.columns = ['Annual Income', 'Loan Status']
Group['Loan Status'] = Group['Loan Status'].map(lambda x: 'Approved' if x == 0 else 'Not Approved')
C = ['LightGreen','LightSalmon']
LineColor = 'Black'
fig = px.histogram(Group, x = 'Annual Income', color='Loan Status',
marginal= 'box', color_discrete_sequence= C, hover_data= Group.columns)
fig.update_traces(marker_line_color= LineColor, marker_line_width=0.2, opacity=1)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray',
showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_xaxes(range=[0, .4e6],
showgrid=True, gridwidth=1, gridcolor='Lightgray',
showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig['layout']['yaxis'].update(range=[0, 2.5e3])
fig.update_layout(plot_bgcolor= 'white',
title={'text': 'Borrowers Annual Income Distribution and the Application Approval Status',
'x':0.46, 'y':0.95, 'xanchor': 'center', 'yanchor': 'top'}, yaxis_title='Frequency')
fig.show()
Group = Data[['dti', 'loan_status']]
Group.columns = ['DTI', 'Loan Status']
Group['Loan Status'] = Group['Loan Status'].map(lambda x: 'Approved' if x == 0 else 'Not Approved')
C = ['LightGreen','LightSalmon']
LineColor = 'Black'
fig = px.histogram(Group, x = 'DTI', color='Loan Status',
marginal= 'box', color_discrete_sequence= C, hover_data= Group.columns)
fig.update_traces(marker_line_color= LineColor, marker_line_width=0.2, opacity=1)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray',
showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_xaxes(range=[0, 1.5e2],
showgrid=True, gridwidth=1, gridcolor='Lightgray',
showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig['layout']['yaxis'].update(range=[0, 1.4e3])
fig.update_layout(plot_bgcolor= 'white',
title={'text': 'Borrowers DTI Score and the Application Approval Status',
'x':0.46, 'y':0.95, 'xanchor': 'center', 'yanchor': 'top'}, yaxis_title='Frequency')
fig.show()
Feat= 'emp_length'
Group = Data.groupby([Feat,'loan_status'])[Feat].agg('count').to_frame('Count').reset_index()
Feat = 'Employment Length'
Temp = [Feat, 'Loan Status', 'Count']
Group.columns = Temp
del Temp
Group['Percentage'] = np.round(100* Group['Count'] /Group['Count'].sum(), 2)
Group['Loan Status'] = Group['Loan Status'].map(lambda x: 'Approved' if x == 0 else 'Not Approved')
Colors = ['LightGreen','LightSalmon']
LineColor = 'Black'
fig = px.bar(Group, x= Feat, y= 'Percentage', orientation='v',
color = 'Loan Status', text = 'Percentage', hover_data= Group.columns, color_discrete_sequence= Colors,
height= 500, width = 750)
fig.update_traces(marker_line_color=LineColor, marker_line_width=.8, opacity=1,
texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(plot_bgcolor= 'white', uniformtext_minsize= 8, uniformtext_mode='hide',
title={'text': 'Borrowers Employment Length and the Application Approval Status',
'x':0.46, 'y':0.94, 'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(range=[0, 40],
showgrid=True, gridwidth=1, gridcolor='Lightgray',
showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
Feat= 'grade'
Group = Data.groupby([Feat,'loan_status'])[Feat].agg('count').to_frame('Count').reset_index()
Feat = Feat.title()
Temp = [Feat, 'Loan Status', 'Count']
Group.columns = Temp
del Temp
Group['Percentage'] = np.round(100* Group['Count'] /Group['Count'].sum(), 2)
Group['Loan Status'] = Group['Loan Status'].map(lambda x: 'Approved' if x == 0 else 'Not Approved')
Colors = ['LightGreen','LightSalmon']
LineColor = 'Black'
fig = px.bar(Group, x= Feat, y= 'Percentage', orientation='v',
color = 'Loan Status', text = 'Percentage', hover_data= Group.columns, color_discrete_sequence= Colors,
height= 500, width = 600)
fig.update_traces(marker_line_color=LineColor, marker_line_width=.8, opacity=1,
texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(plot_bgcolor= 'white', uniformtext_minsize= 8, uniformtext_mode='hide',
title={'text': 'Borrowers Grade and the Application Approval Status',
'x':0.46, 'y':0.94, 'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(range=[0, 30],
showgrid=True, gridwidth=1, gridcolor='Lightgray',
showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
Feat= 'purpose'
Group = Data.groupby([Feat,'loan_status'])[Feat].agg('count').to_frame('Count').reset_index()
Feat = Feat.title()
Temp = [Feat, 'Loan Status', 'Count']
Group.columns = Temp
Group[Feat] = Group[Feat].map(lambda x: mytitle(x))
del Temp
Group['Percentage'] = np.round(100* Group['Count'] /Group['Count'].sum(), 2)
Group['Loan Status'] = Group['Loan Status'].map(lambda x: 'Approved' if x == 0 else 'Not Approved')
Colors = ['LightGreen','LightSalmon']
LineColor = 'Black'
fig = px.bar(Group, x= Feat, y= 'Percentage', orientation='v',
color = 'Loan Status', text = 'Percentage', hover_data= Group.columns, color_discrete_sequence= Colors,
height= 500)
fig.update_traces(marker_line_color=LineColor, marker_line_width=.8, opacity=1,
texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(plot_bgcolor= 'white', uniformtext_minsize= 8, uniformtext_mode='hide',
title={'text': 'Purpose of the Loan and the Application Approval Status',
'x':0.46, 'y':0.92, 'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(range=[0, 60],
showgrid=True, gridwidth=1, gridcolor='Lightgray',
showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()